Scatter plot

Concept: Scatter Plot

Scatter plots are a type of data visualization used to display the values of two variables for a set of data. The variables are typically continuous, although scatter plots can also be used to visualize categorical variables. The data are displayed as a collection of points, with the value of one variable determining each point's position on the horizontal axis and the value of the other variable determining its position on the vertical axis. Scatter plots are used to observe relationships between variables.

Properties:

  1. Axes: Represent the variables being compared. Typically, the independent variable or predictor is placed on the x-axis, and the dependent variable or response is placed on the y-axis.

  2. Points: Each point on the scatter plot represents an individual data point. The position is determined by the values of the two variables.

  3. Trend: The overall direction of the data points in the plot can suggest a relationship (e.g., positive, negative, or no correlation).

  4. Clusters: Groupings or clusters of points may indicate that those data points have something in common.

  5. Outliers: Points that lie significantly outside the general distribution of data may be highlighted as they could indicate anomalies or special cases.

# Load necessary libraries
# Install ggplot2 only when it is missing. requireNamespace() checks for the
# package without attaching it, so the library() call below is the single place
# where ggplot2 gets attached, and it stops with an error if the package is
# still unavailable.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Generate sample data
set.seed(42) # For reproducibility
x <- rnorm(100, mean = 50, sd = 10) # Sample x-values
y <- x * 1.5 + rnorm(100, sd = 5) # y is 1.5 * x plus normally distributed noise

# Create a data frame from the vectors (columns are named x and y)
data <- data.frame(x, y)

# Generate the scatter plot
ggplot(data, aes(x = x, y = y)) +
  # Semi-transparent points whose color follows the x-value
  geom_point(aes(color = x), size = 3, alpha = 0.6) +
  theme_minimal() + # Clean theme
  labs(
    title = "Scatter Plot of X vs Y",
    x = "X Axis Label",
    y = "Y Axis Label"
  ) +
  scale_color_gradient(low = "blue", high = "red") + # Gradient from blue to red
  theme(plot.title = element_text(hjust = 0.5)) # Center the plot title

# Attach ggplot2
library(ggplot2)

# mtcars is a built-in dataset, so it does not need to be loaded explicitly

# Complete plot: all layers stacked in a single call
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) + # Data and aesthetics
  geom_point() + # Geometry layer: one point per row of mtcars
  stat_smooth(
    # A constant color label gives the fitted line its own legend entry
    mapping = aes(colour = "Regression Line"),
    method = "lm",
    se = TRUE,
    linewidth = 1
  ) + # Statistics layer: linear model fit
  coord_cartesian(xlim = c(1.5, 5.5), ylim = c(10, 35)) + # Coordinate layer
  scale_x_continuous(breaks = seq(2, 5, by = 1)) + # Scale layer: x tick positions
  scale_y_continuous(breaks = seq(10, 35, by = 5)) + # Scale layer: y tick positions
  scale_colour_manual(values = "red", labels = "Linear Model") + # Line color and legend text
  ggtitle("Car Weight vs. Mileage") + # Plot title
  xlab("Weight (1000 lbs)") + # Axis labels
  ylab("Miles per Gallon") +
  labs(colour = "Line Type") + # Legend title
  theme_bw() + # Theme layer
  theme(
    plot.title = element_text(hjust = 0.5), # Center the title
    axis.title = element_text(face = "bold") # Bold axis titles
  )

Scatter plot - Layer by Layer

1. Creating data and aesthetics Layer.

library(ggplot2)
# Data and aesthetic mappings only: no geometry has been added yet
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg))

2. Adding the Geometric Layer (geom_point)

# Data and aesthetics: wt is mapped to x, mpg to y
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  # Geometry layer: draw each row of mtcars as a point
  geom_point()

3. Adding Statistical Layer - Regression Line

  # Data and aesthetics: wt is mapped to x, mpg to y
  ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
    geom_point() + # Geometry layer
    # Statistics layer: linear model fit; the constant color label puts the
    # line in the legend
    stat_smooth(
      mapping = aes(colour = "Regression Line"),
      method = "lm",
      se = TRUE,
      linewidth = 1
    )

4. Adding the Coordinate Layer

  # Data and aesthetics: wt is mapped to x, mpg to y
  ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
    geom_point() + # Geometry layer
    stat_smooth(
      mapping = aes(colour = "Regression Line"),
      method = "lm",
      se = TRUE,
      linewidth = 1
    ) + # Statistics layer: linear model fit
    # Coordinate layer: set the visible x and y ranges
    coord_cartesian(xlim = c(1.5, 5.5), ylim = c(10, 35))

5. Adding the scale layer

  # Data and aesthetics: wt is mapped to x, mpg to y
  ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +  # Geometry layer
  stat_smooth(method = "lm", aes(color = "Regression Line"), se = TRUE, linewidth = 1) +  # Statistics layer: linear model fit
  coord_cartesian(xlim = c(1.5, 5.5), ylim = c(10, 35)) +  # Coordinate layer
  scale_x_continuous(breaks = seq(2, 5, by = 1)) +  # Scale layer: x tick positions
  # y ticks every 5 mpg, the same spacing the complete plot earlier in this
  # document uses, so this step builds towards that exact figure
  scale_y_continuous(breaks = seq(10, 35, by = 5))

6. Adding color to Regression Line

# Coordinate limits, axis breaks and line color use the same values as the
# complete plot earlier in this document, so the color scale is the only new
# element introduced by this step.
ggplot(mtcars, aes(x = wt, y = mpg)) +  # Data and aesthetics
  geom_point() +  # Geometry layer
  stat_smooth(method = "lm", aes(color = "Regression Line"), se = TRUE, linewidth = 1) +  # Statistics layer: linear model fit
  coord_cartesian(xlim = c(1.5, 5.5), ylim = c(10, 35)) +  # Coordinate layer
  scale_x_continuous(breaks = seq(2, 5, by = 1)) +  # Scale layer: x tick positions
  scale_y_continuous(breaks = seq(10, 35, by = 5)) +  # Scale layer: y tick positions
  scale_color_manual(values = "red", labels = "Linear Model") # Color and legend text for the regression line

7. Adding labels and Title

# Coordinate limits, axis breaks and line color use the same values as the
# complete plot earlier in this document; the labs() call is what this step adds.
ggplot(mtcars, aes(x = wt, y = mpg)) +  # Data and aesthetics
  geom_point() +  # Geometry layer
  stat_smooth(method = "lm", aes(color = "Regression Line"), se = TRUE, linewidth = 1) +  # Statistics layer: linear model fit
  coord_cartesian(xlim = c(1.5, 5.5), ylim = c(10, 35)) +  # Coordinate layer
  scale_x_continuous(breaks = seq(2, 5, by = 1)) +  # Scale layer: x tick positions
  scale_y_continuous(breaks = seq(10, 35, by = 5)) +  # Scale layer: y tick positions
  scale_color_manual(values = "red", labels = "Linear Model") + # Color and legend text for the regression line
  labs(title = "Car Weight vs. Mileage",  # Plot title
       x = "Weight (1000 lbs)",  # Axis labels
       y = "Miles per Gallon",
       color = "Line Type")   # Legend title

8. Adding Themes

# Final step: with the theme added, this call uses the same limits, breaks and
# line color as the complete plot earlier in this document, so the layer-by-layer
# build ends on that exact figure.
ggplot(mtcars, aes(x = wt, y = mpg)) +  # Data and aesthetics
  geom_point() +  # Geometry layer
  stat_smooth(method = "lm", aes(color = "Regression Line"), se = TRUE, linewidth = 1) +  # Statistics layer: linear model fit
  coord_cartesian(xlim = c(1.5, 5.5), ylim = c(10, 35)) +  # Coordinate layer
  scale_x_continuous(breaks = seq(2, 5, by = 1)) +  # Scale layer: x tick positions
  scale_y_continuous(breaks = seq(10, 35, by = 5)) +  # Scale layer: y tick positions
  scale_color_manual(values = "red", labels = "Linear Model") + # Color and legend text for the regression line
  labs(title = "Car Weight vs. Mileage",  # Plot title
       x = "Weight (1000 lbs)",  # Axis labels
       y = "Miles per Gallon",
       color = "Line Type") +  # Legend title
  theme_bw() +  # Theme layer
  theme(plot.title = element_text(hjust = 0.5),  # Center the title
        axis.title = element_text(face = "bold"))  # Bold axis titles